import plotly.offline as pyo
# Set notebook mode to work in offline
pyo.init_notebook_mode()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import geopandas as gpd
import pprint
import numpy as np
#plt.rcParams['figure.figsize'] = [20, 15]
As of Jan 1, 2022, we can find 11 AirBnb data sets on http://insideairbnb.com/get-the-data.html, which were scrapped one time per month between Dec 2020 and Nov 2021, with May 2021 skipped. The 11 datasets where downloaded and put in 11 folders.
Get two reference data:
We choose data of confirmed cases in NSW instead of Australia based on the assumption that the Covid situation in NSW has stronger impact on NSW AirBnb business than the Covid situations outside of NSW.
We choose data of Google Trends queried by users in Australia instead of NSW based on the assumption that NSW AirBnb guests are usually from all over Australia instead of NSW local residents. Opinions on Covid of people all over Australia may influence their decision on whether to book a NSW AirBnb property or not.
The numbers in Google Trends data indicate how the users' "interest" in term Covid changed during the given period. Roughly speaking, the highest frequency of search happening at a timepoint within the period is normalized to 100, and the frequencies happening at rest of the timepoints are normalzied to numbers <= 100, by comparing the frequencies against the highest.
# As of Jan 1, 2022, we can find 11 AirBnb data sets on http://insideairbnb.com/get-the-data.html,
# which were scrapped one time per month between Dec 2020 and Nov 2021, with May 2021 skipped.
# The 11 datasets where downloaded and put in 11 folders. The foler names are shown below.
AIRBNB_FOLDERS = ['data_airbnb_syd/20201214/',
'data_airbnb_syd/20210110/',
'data_airbnb_syd/20210208/',
'data_airbnb_syd/20210304/',
'data_airbnb_syd/20210410/',
'data_airbnb_syd/20210605/',
'data_airbnb_syd/20210705/',
'data_airbnb_syd/20210807/',
'data_airbnb_syd/20210908/',
'data_airbnb_syd/20211009/',
'data_airbnb_syd/20211106/']
# The file of confirmed Covid19 cases in NSW was downladed, and stored at location below
NSW_COVID_CASE_FILE = './data_covid/nsw_health/confirmed_cases_table1_location.csv'
# The file of Google Trends of query term 'Covid' was downloaded, and stored at location below
GOOGLE_TRENDS_COVID_FILE = './data_covid/google_trends/multiTimeline.csv'
In this use case we estimate the number of AirBnb properties in operation and their occupancy rate in the period between Dec, 2020 and Nov, 2021. The data were collected by 11 scrapings, one scraping executed in each month, with May 2021 being skipped. One scraping usually lasted for 1-2 day.
We treat the status of properties at the time they were scraped as a snapshot, and therefore as a sample of their status in usual days. We estimate the number of properties in operation by counting the listed properties at scraping time, and whether the properties were occupied/vacant by checking if they were available for booking at scraping time.
The second estimation is based on a simplistic assumption that "available" means "vacant", and "unavailable" means "occupied". In reality, a property can be unavailable for two reasons, namely it is booked/occupied by guests, or the host of the property blocks booking. We assume the second reason for unavailable is not common, and can be safely ignored in the analysis.
We observe changes of property number and occapancy rate from the dataset. In pursuing the causes of the changes, we use two reference data, New South Wales confirmed Covid case data and Google Trends data on query term Covid.
import os
available_property_nubers = []
unavailable_property_numbers = []
for folder in FOLDERS:
date_label = folder.split('/')[1]
last_scrape_date = pd.read_csv(os.path.join(folder, 'listings_detailed.csv'),
header='infer',
usecols=['id', 'calendar_last_scraped'],
parse_dates=['calendar_last_scraped'])
last_scrape_date = last_scrape_date.rename(columns={'id': 'listing_id',
'calendar_last_scraped': 'date'})
df_cal_temp = pd.read_csv(os.path.join(folder, 'calendar_detailed.csv'),
header='infer',
usecols=['listing_id', 'available', 'date'],
parse_dates=['date']
)
df_cal_on_scrape_day = pd.merge(df_cal_temp, last_scrape_date, how='inner', on=['listing_id', 'date'])
available_property_nubers.append((date_label, (df_cal_on_scrape_day['available']=='t').sum()))
unavailable_property_numbers.append((date_label, (df_cal_on_scrape_day['available']=='f').sum()))
df_property_availability = pd.DataFrame({'date': [d for d, _ in available_property_nubers],
'available': [a for _, a in available_property_nubers],
'unavailable': [u for _, u in unavailable_property_numbers]})
df_property_availability['date'] = pd.to_datetime(df_property_availability['date'])
df_property_availability['property_number'] = df_property_availability['available'] + df_property_availability['unavailable']
df_property_availability['unavailable_ratio'] = df_property_availability['unavailable'] / df_property_availability['property_number']
df_property_availability['available_ratio'] = df_property_availability['available'] / df_property_availability['property_number']
df_nsw_cases = pd.read_csv(NSW_COVID_CASE_FILE, header='infer', parse_dates=['notification_date'])
df_nsw_cases_count = df_nsw_cases[['notification_date']]
df_nsw_cases_count = df_nsw_cases_count.groupby('notification_date').size().reset_index(name='counts')
df_nsw_cases_count = df_nsw_cases_count.loc[(df_nsw_cases_count['notification_date'] >= '2020-11-01') & (df_nsw_cases_count['notification_date'] <= '2021-11-30')]
df_trends = pd.read_csv(GOOGLE_TRENDS_COVID_FILE, header=1, parse_dates=['Week'])
#df_trends['covid: (Australia)'] = df_trends['covid: (Australia)'].str.replace(r'<1', '0')
df_trends['covid: (Australia)'] = df_trends['covid: (Australia)'].apply(pd.to_numeric) # latest version of plotly has a bug, needing explicitly changing object to numeric
df_trends = df_trends.loc[(df_trends['Week'] >= '2020-11-1') & (df_trends['Week'] <= '2021-11-30')]
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Create figure with secondary y-axis
fig = make_subplots(rows=2,
cols=1,
specs=[[{"secondary_y": True}],
[{"secondary_y": True}]
],
shared_xaxes=True,
vertical_spacing=0.02,
#subplot_titles=("NSW AirBnB Listing, Vacancy V.S. Confirmed Case, Google trends <I>Covid<I>")
)
# Add traces
fig.add_trace(
go.Bar(x=df_property_availability['date'], y=df_property_availability['property_number'], name="Listed properties"),
secondary_y=False,
row=1,
col=1
)
fig.add_trace(
go.Bar(x=df_property_availability['date'], y=df_property_availability['available'], name="Vacant (~available) properties of AirBnb in NSW"),
secondary_y=False,
row=1,
col=1
)
fig.add_trace(
go.Scatter(x=df_property_availability['date'], y=df_property_availability['available_ratio'], name="Vacancy (~available) rate of AirBnb in NSW"),
secondary_y=True,
row=1,
col=1
)
# Set x-axis title
#fig.update_xaxes(title_text="Sampling Date", row=1, col=1)
# Set y-axes titles
fig.update_yaxes(title_text="Property Number (log)", type='log', secondary_y=False, row=1, col=1)
fig.update_yaxes(title_text="Vacancy (~Available) Rate", range=[0, 0.5], secondary_y=True, row=1, col=1)
# Add traces
fig.add_trace(
go.Bar(x=df_nsw_cases_count['notification_date'], y=df_nsw_cases_count['counts'], name="Confirmed cases in NSW"),
secondary_y=False,
row=2,
col=1
)
fig.add_trace(
go.Scatter(x=df_trends['Week'], y=df_trends['covid: (Australia)'], name="Australian users' interest in <I>Covid</I> (Google Trends)"),
secondary_y=True,
row=2,
col=1
)
# Set x-axis title
#fig.update_xaxes(title_text="Date", row=2, col=1)
# Set y-axes titles
fig.update_yaxes(title_text="Confirmed case (log)", type='log', secondary_y=False, row=2, col=1)
fig.update_yaxes(title_text="Australian users' interest in <I>Covid</I> on Google", secondary_y=True, row=2, col=1)
# Add figure title
fig.update_layout(
title_text="NSW AirBnB Listing, Vacancy V.S. Confirmed Case, Google trends <I>Covid</I>",
height=800
)
# Set x-axis title
#fig.update_xaxes(title_text="Date")
# Set y-axes titles
#fig.update_yaxes(title_text="NSW Google search term Covid", type='log', secondary_y=True)
#fig.update_yaxes(title_text="NSW confirmed case number (Logorithm)", type='log', secondary_y=False)
fig.show()
An initial and preliminary analysis of the diagram suggests:
TBD
TBD
df_listings_summary_20211106.head()
#df_listings_summary_20211106.describe(include='all')
df_listings_summary_20211106['id'].nunique()
20513
df_listings_summary_20211106.query("availability_365 == 365").nunique()
df_listings_summary_20211106.query("availability_365 == 364").nunique()
for k, v in df_bnb_listings_detailed.iloc[0].iteritems():
print(k,'\n' ,v)
df_bnb_listings_detailed.info()
df_bnb_cal_detailed.query("listing_id==11156")
df_bnb_cal_detailed.iloc[0]
df_listing_neighbourhood = df_listings_summary_20211106.groupby('neighbourhood')['id', 'host_id'].nunique()
df_listing_neighbourhood.reset_index(inplace=True)
#df_listing_neighbourhood
#fig, ax = plt.subplots(figsize=(18, 8))
#plt.xticks(rotation=75)
#ax = sns.barplot(data=df_listing_neighbourhood,
# x='neighbourhood', y='id')
#fig = px.bar(df_listing_neighbourhood,
# x='neighbourhood',
# y='id',
# hover_data=['id', 'host_id'],
# labels={'id': 'number of properties', 'host_id': 'number of hosts'},
# title='Number of properties in neighbourhoods',
# color='id',
# height=800
# )
#fig.show()
#syd_geojson = gpd.read_file('./data_airbnb_syd/20211106/neighbourhoods.geojson')
#syd_geojson
#syd_geojson_joined = syd_geojson.merge(df_listing_neighbourhood, on='neighbourhood')
#syd_geojson_joined
#fig, axes = plt.subplots(2, figsize=(40, 25))
#syd_geojson_joined.plot(column='id',
# ax=axes[0],
# legend=True,
# legend_kwds={'label': 'Number of properties', 'orientation': 'vertical'})
#
#syd_geojson_joined.plot(column='host_id',
# ax=axes[1],
# legend=True,
# legend_kwds={'label': 'Number of hosts', 'orientation': 'vertical'})
#syd_geojson_joined.boundary.plot()
#syd_geojson_joined.centroid.plot()
#syd_geojson_joined.explore(column='id',
# popup=['neighbourhood', 'id', 'host_id'],
# popup_kwds={'aliases': [('neighbourhood'), ('property number'), ('host number')]},
# tooltip=['neighbourhood', 'id', 'host_id'],
# tooltip_kwds={'aliases': [('neighbourhood'), ('property number'), ('host number')]},
# legend=True,
# legend_kwds={'caption': 'Number of properties'})
#syd_geojson_joined.explore(column='host_id',
# popup=['neighbourhood', 'id', 'host_id'],
# popup_kwds={'aliases': [('neighbourhood'), ('property number'), ('host number')]},
# tooltip=['neighbourhood', 'id', 'host_id'],
# tooltip_kwds={'aliases': [('neighbourhood'), ('property number'), ('host number')]},
# legend=True,
# legend_kwds={'caption': 'Number of hosts'},
# k=2)
gpd.tools.geocode(syd_geojson['neighbourhood'])
fig = px.violin(df_listings_summary_20211106,
y="price",
x="neighbourhood",
color="neighbourhood",
box=True,
points='outliers',#points="all",
hover_data=['neighbourhood', 'price'],
#width=2000,
#height=2000,
title='Price in neighbourhoods'
)
fig.show()
df_listing_20211106_price = df_listings_summary_20211106.groupby('neighbourhood')['price'].agg([np.min, np.max, np.mean, np.median])
df_listing_20211106_price = df_listing_20211106_price.reset_index()
df_listing_20211106_price
df_listing_20211106_price.iloc[0]
#fig = px.choropleth(df_listing_20211106_price,
# geojson=syd_geojson,
# locations='neighbourhood',
# featureidkey='properties.neighbourhood',
# color='median',
# color_continuous_scale='Viridis',
# hover_name='neighbourhood',
# hover_data=['median', 'mean']
# )
#fig.update_layout(margin={"r":0, "t":0, "l": 0, "b": 0})
#fig.update_geos(fitbounds="locations")
#fig.show()
#dir(syd_geojson.centroid[0])
print(syd_geojson.centroid[0].coords[0])
## px.choropleth_mapbox does not calculate center and zoom automatically.
## randomly select one point within relevant area as the center of map.
#center_coords = syd_geojson.centroid[0].coords[0] # note: it is in format (point.x, point.y), so (longtitude, latitude)
#
#fig = px.choropleth_mapbox(df_listing_20211106_price,
# geojson=syd_geojson,
# locations='neighbourhood',
# featureidkey='properties.neighbourhood',
# color='median',
# color_continuous_scale='Viridis',
# hover_name='neighbourhood',
# hover_data=['median', 'mean'],
# mapbox_style="open-street-map",
# center={'lat': center_coords[1], 'lon': center_coords[0]}
# )
#fig.update_layout(margin={"r":0, "t":0, "l": 0, "b": 0})
#fig.update_geos(fitbounds="locations", visible=False) ## it is not working on px.choropleth_mapbox. need to set center manually
#
#fig.show()
#center_coords = syd_geojson.centroid[0].coords[0]
#
#fig = px.choropleth_mapbox(df_listing_20211106_price,
# geojson=syd_geojson,
# locations='neighbourhood',
# featureidkey='properties.neighbourhood',
# color='amax',
# color_continuous_scale="Viridis",
# hover_name='neighbourhood',
# hover_data=['amax', 'amin'],
# center={'lat': center_coords[1], 'lon': center_coords[0]}
# )
#fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
#fig.update_layout(mapbox_style="open-street-map")
#fig.update_geos(fitbounds="locations", visible=False)## it is not working on px.choropleth_mapbox. need to set center manually
# Note: if you put fig.update_layout() below instead of above (before fig.update_geos),
# no data will be shown on map.
#fig.update_layout(mapbox_style="open-street-map")
#fig.show()
import os
df_list = []
for folder in FOLDERS:
date_label = folder.split('/')[1]
file_path = os.path.join(folder, 'listings.csv')
df_temp = pd.read_csv(file_path, header='infer')
df_temp = df_temp.groupby('neighbourhood')['price'].agg([np.min, np.max, np.mean, np.median])
df_temp = df_temp.reset_index()
df_temp = df_temp.assign(listing_date=date_label)
df_list.append(df_temp)
df_pricing = pd.concat(df_list)
df_pricing
df_pricing['listing_date_form'] = pd.to_datetime(df_pricing['listing_date'])
df_pricing['listing_date_form'] = df_pricing['listing_date_form'].astype(str)
df_pricing.sort_values('listing_date_form')
df_pricing
df_pricing.info()
fig = px.bar(df_pricing,
x="neighbourhood",
y="median",
color="neighbourhood",
animation_frame="listing_date_form",
animation_group='neighbourhood',
range_x=[0,37],
range_y=[0,600])
fig["layout"].pop("updatemenus") # optional, drop animation buttons
fig.show()
fig = px.bar(df_pricing,
x="neighbourhood",
y="amin",
color="neighbourhood",
animation_frame="listing_date_form",
animation_group='neighbourhood',
range_x=[0,37],
range_y=[0,100])
fig["layout"].pop("updatemenus") # optional, drop animation buttons
fig.show()
fig = px.bar(df_pricing,
x="neighbourhood",
y="amax",
color="neighbourhood",
animation_frame="listing_date_form",
animation_group='neighbourhood',
range_x=[0,37],
range_y=[1,100000],
log_y=True
)
fig["layout"].pop("updatemenus") # optional, drop animation buttons
fig.show()
df_listings_summary_20201214 = pd.read_csv("./data_airbnb_syd/20201214/listings.csv", header='infer')
df_listings_summary_20201214['id'].unique().shape
print('number of properties {}'.format(df_listings_summary_20201214['id'].unique().shape[0]))
print('number of hosts {}'.format(df_listings_summary_20201214['host_id'].unique().shape[0]))
import os
property_numbers = []
host_numbers = []
for folder in FOLDERS:
date_label = folder.split('/')[1]
file_path = os.path.join(folder, 'listings.csv')
df_temp = pd.read_csv(file_path, header='infer')
property_numbers.append((date_label, df_temp['id'].unique().shape[0]))
host_numbers.append((date_label, df_temp['host_id'].unique().shape[0]))
property_numbers
host_numbers
df_property_host_numbers = pd.DataFrame({'date': [d for d, _ in property_numbers],
'property_number': [n for _, n in property_numbers],
'host_number': [n for _, n in host_numbers]})
df_property_host_numbers
| date | property_number | host_number | |
|---|---|---|---|
| 0 | 20201214 | 33871 | 25321 |
| 1 | 20210110 | 33902 | 25097 |
| 2 | 20210208 | 33630 | 24965 |
| 3 | 20210304 | 33229 | 24759 |
| 4 | 20210410 | 33229 | 24759 |
| 5 | 20210605 | 32079 | 24055 |
| 6 | 20210705 | 31899 | 23913 |
| 7 | 20210807 | 31764 | 23799 |
| 8 | 20210908 | 31030 | 23467 |
| 9 | 20211009 | 30223 | 22980 |
| 10 | 20211106 | 20513 | 14977 |
#import matplotlib.pyplot as plt
#plt.xticks(rotation=45)
#plt.title('Property and Host numbers 2020 Dec - 2021 Nov')
#
#plt.plot(df_property_host_numbers['date'], df_property_host_numbers['property_number'], label='Property Number')
#plt.plot(df_property_host_numbers['date'], df_property_host_numbers['host_number'], label='Host Number')
#
#plt.legend()
#sns.relplot(data=df_property_host_numbers, kind='line')
#pd.melt(df_property_host_numbers,
# id_vars=['date'],
# value_vars=['property_number', 'host_number'],
# var_name='number',
# )
#sns.relplot(data=pd.melt(df_property_host_numbers,
# id_vars=['date'],
# value_vars=['property_number', 'host_number'],
# var_name='type',
# value_name='number'),
# x='date',
# y='number',
# hue='type',
# kind='line')
fig, ax = plt.subplots()
plt.title('Property and Host numbers 2020 Dec - 2021 Nov')
plt.xticks(rotation=45)
ax = sns.lineplot(data=df_property_host_numbers, x='date', y='property_number', label='Property Number')
ax1 = sns.lineplot(data=df_property_host_numbers, x='date', y='host_number', label='Host Number')
plt.xlabel('date')
plt.ylabel('number')
plt.legend()
df_listings_detailed_20211106 = pd.read_csv("./data_airbnb_syd/20211106/listings_detailed.csv", header='infer')
df_listings_detailed_20211106.head()
df_listings_detailed_20211106.info()
df_listings_detailed_20211106.iloc[0]
df_listings_detailed_20211106['calendar_last_scraped'].unique()
array(['2021-11-07', '2021-11-06'], dtype=object)
df_neighbourhoods_csv = pd.read_csv('./data_airbnb_syd/20210110/neighbourhoods.csv', header='infer')
df_neighbourhoods_csv.head()
df_neighbourhoods_csv.count()
df_neighbourhoods_csv
df_neighbourhoods_geo = pd.read_json('./data_airbnb_syd/20210110/neighbourhoods.geojson')
df_neighbourhoods_geo.head()
print(df_neighbourhoods_geo.iloc[0])
print(df_neighbourhoods_geo.iloc[0]['features'])